{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bfc6efd35877482fab97a58798a178be",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b457b4b3b2eb4fa1a1e12b17731baea6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3f38a9d9ea4f4d658c68d790e2137930",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9942411e672c41038f5886d0bec29322",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cbaf88354da04305bd3614875e301e23",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c6c6a0c016b348d89495411c1d880f56",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d89f137ce70440bb89364597d18e52e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['id', 'title', 'context', 'question', 'answers'],\n",
      "        num_rows: 87599\n",
      "    })\n",
      "    validation: Dataset({\n",
      "        features: ['id', 'title', 'context', 'question', 'answers'],\n",
      "        num_rows: 10570\n",
      "    })\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "dataset = datasets.load_dataset(\"squad\")\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = dataset[\"train\"]\n",
    "eval_data = dataset[\"validation\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_texts = [\n",
    "    f\"Question: {q} Answer: {a['text'][0]}\"\n",
    "    for q, a in zip(train_data[\"question\"], train_data[\"answers\"])\n",
    "]\n",
    "eval_texts = [\n",
    "    f\"Question: {q} Answer: \"\n",
    "    for q, a in zip(eval_data[\"question\"], eval_data[\"answers\"])\n",
    "]\n",
    "eval_answers = [a[\"text\"][0] for a in eval_data[\"answers\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? Answer: Saint Bernadette Soubirous'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_texts[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Question: Which NFL team represented the AFC at Super Bowl 50? Answer: Denver Broncos'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eval_texts[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Question: To whom did the Virgin Mary allegedl...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Question: What is in front of the Notre Dame M...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Question: The Basilica of the Sacred heart at ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Question: What is the Grotto at Notre Dame? An...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Question: What sits on top of the Main Buildin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87594</th>\n",
       "      <td>Question: In what US state did Kathmandu first...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87595</th>\n",
       "      <td>Question: What was Yangon previously known as?...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87596</th>\n",
       "      <td>Question: With what Belorussian city does Kath...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87597</th>\n",
       "      <td>Question: In what year did Kathmandu create it...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>87598</th>\n",
       "      <td>Question: What is KMC an initialism of? Answer...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>87599 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    text\n",
       "0      Question: To whom did the Virgin Mary allegedl...\n",
       "1      Question: What is in front of the Notre Dame M...\n",
       "2      Question: The Basilica of the Sacred heart at ...\n",
       "3      Question: What is the Grotto at Notre Dame? An...\n",
       "4      Question: What sits on top of the Main Buildin...\n",
       "...                                                  ...\n",
       "87594  Question: In what US state did Kathmandu first...\n",
       "87595  Question: What was Yangon previously known as?...\n",
       "87596  Question: With what Belorussian city does Kath...\n",
       "87597  Question: In what year did Kathmandu create it...\n",
       "87598  Question: What is KMC an initialism of? Answer...\n",
       "\n",
       "[87599 rows x 1 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df = pd.DataFrame(train_texts, columns=[\"text\"])\n",
    "eval_df = pd.DataFrame({\"text\": eval_texts, \"answer\": eval_answers})\n",
    "\n",
    "train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.to_json(\"../data/squad-train.jsonl\", orient=\"records\", lines=True)\n",
    "eval_df.to_json(\"../data/squad-eval.jsonl\", orient=\"records\", lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df[\"rag_query\"] = train_data[\"question\"]\n",
    "eval_df[\"rag_query\"] = eval_data[\"question\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.to_json(\"../data/squad-train.jsonl\", orient=\"records\", lines=True)\n",
    "eval_df.to_json(\"../data/squad-eval.jsonl\", orient=\"records\", lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "st",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
